library(tidyverse)
steam <- read_csv("raw_data/steam_checkpoint_2.csv")
Rows: 26564 Columns: 27── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (10): name, developer, publisher, controller_support, multiplayer, categories, genres, steamspy_tags, general_rating, owners
dbl (9): appid, required_age, achievements, positive_ratings, negative_ratings, average_playtime, median_playtime, price, total_reviews
lgl (7): free_to_play, virtual_reality_support, steam_workshop, singleplayer, windows_support, mac_support, linux_support
date (1): release_date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) :
invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) :
attempt to use zero-length variable name
steam
steam %>%
select(name, owners)
#For the purposes of making a model, what would we want the target to
be? If we were to create a column called “successful”, what variables
would it look for?
steam %>%
separate(col = steamspy_tags, sep = ";", into = c("tag_1", "tag_2", "tag_3")) %>%
pivot_longer(cols = c("tag_1", "tag_2", "tag_3"), values_to = "tags", names_to = "drop_me") %>%
drop_na(tags) %>%
group_by(tags) %>%
count(tags) %>%
arrange(desc(n))
Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2551 rows [1283, 3566, 3669, 3711, 4035, 4146, 4457, 4733, 4822, 5019, 5140, 5142, 5264, 5361, 5378, 5497, 5553, 5556, 5570, 5756, ...].
# creating new logical columns for use in the model - we cant have every genre/tag, so a selection of a few will have to do
# Preferably ones without a lot of overlap
# Not including Indie just now, because anything developed by Dinkey game would be Indie by default - It's a tag that doesnt really describe anything about the game
steam <- steam %>%
mutate(is_open_world = ifelse(str_detect(steamspy_tags, "Open World"), TRUE, FALSE),
is_rogue_like = ifelse(str_detect(steamspy_tags, "Rogue-like"), TRUE, FALSE),
is_metroidvania = ifelse(str_detect(steamspy_tags, "Metroidvania"), TRUE, FALSE),
is_visual_novel = ifelse(str_detect(steamspy_tags, "Visual Novel"), TRUE, FALSE),
has_great_soundtrack = ifelse(str_detect(steamspy_tags, "Great Soundtrack"), TRUE, FALSE),
is_shooter =ifelse(str_detect(steamspy_tags, "Shooter") | str_detect(steamspy_tags, "FPS") | str_detect(steamspy_tags, "Third-Person Shooter"), TRUE, FALSE),
is_rpg = ifelse(str_detect(steamspy_tags, "RPG"), TRUE, FALSE),
has_female_protagonist = ifelse(str_detect(steamspy_tags, "Female Protagonist"), TRUE, FALSE),
is_sports = ifelse(str_detect(steamspy_tags, "Sports"), TRUE, FALSE),
is_simulation = ifelse(str_detect(steamspy_tags, "Simulation"), TRUE, FALSE),
.after = multiplayer)
Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) :
invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) :
attempt to use zero-length variable name
steam <- steam %>%
mutate(is_positive = ifelse(str_detect(general_rating, "Positive"), TRUE, FALSE),
.after = linux_support)
Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) :
invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) :
attempt to use zero-length variable name
For the purpose of my model over in Python land, i’m turning
Multiplayer into just a logical to see if it changes anything
steam %>%
distinct(multiplayer)
steam <- steam %>%
mutate(multiplayer = ifelse(str_detect(multiplayer, "No multiplayer"), FALSE, TRUE))
Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) :
invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) :
attempt to use zero-length variable name
#ok fine im adding indie to see what happens
steam <- steam %>%
mutate(is_indie = ifelse(str_detect(steamspy_tags, "Indie"), TRUE, FALSE))
Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) :
invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) :
attempt to use zero-length variable name
steam %>%
filter(multiplayer == FALSE)
# what if we bumped up the threshold? What if mostly positive was no longer acceptable?
steam_but_harsher <- steam %>%
mutate(is_positive = case_when(
general_rating == "Extremely Positive" | general_rating == "Positive" ~ TRUE,
TRUE ~ FALSE
))
write_csv(steam, "clean_data/steam_for_model.csv")
write_csv(steam_but_harsher, "clean_data/harsher_steam_for_model.csv")
For now, im gonna do some text stuff using backloggd reviews
backloggd_reviews <- read_csv("clean_data/backloggd_reviews.csv")
Rows: 1509 Columns: 8── Column specification ─────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): title, summary, reviews, genre_tag, genre_tag_2, genre_tag_3, genre_tag_4
dbl (1): rating
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# separating reviews into their own columns so i can manage them
reviews_separated <- backloggd_reviews %>%
separate_longer_delim(cols = "reviews", delim = "',\ \'") %>%
separate_longer_delim(cols = "reviews", delim = "\',\ \"") %>%
separate_longer_delim(cols = "reviews", delim = "\",\ \'") %>%
separate_longer_delim(cols = "reviews", delim = "\",\ \"") %>%
unique() %>%
# removing all the opening and closing rubbish
mutate(reviews = str_remove_all(string = reviews, pattern = "\\[\""),
reviews = str_remove_all(string = reviews, pattern = "\\[\'"),
reviews = str_remove_all(string = reviews, pattern = "\"\\]"),
reviews = str_remove_all(string = reviews, pattern = "\'\\]"))
reviews_separated %>%
group_by(title) %>%
count(title)
lets split this into ratings, like i did with the other backloggd
thing
reviews_separated <- reviews_separated %>%
mutate(rating_range = case_when(
rating < 5 & rating >= 4 ~ "4+",
rating < 4 & rating >= 3 ~ "3 to 4",
rating < 3 & rating >= 2 ~ "2 to 3",
rating < 2 & rating >= 1 ~ "1 to 2",
rating < 1 ~ ">1",
TRUE ~ "No rating"
))
Before doing text analysis, lets convert everything to lower case -
this might take away some context via capitalisation, but we only have 6
reviews per game so gotta work with what we have
reviews_lower_separated <- reviews_separated %>%
mutate(reviews = tolower(reviews)) #%>%
#filter(reviews %in% c("í", "ñ", "é", "á", "ó", "á") == FALSE)
library(textdata)
library(tidytext)
de_stopwords <- tibble(word = stopwords("de"))
Error in stopwords("de") : could not find function "stopwords"

reviews_3_to_lowest_stop <- reviews_lower_separated %>%
filter(rating_range == "2 to 3" | rating_range == "1 to 2" | rating_range == ">1") %>%
select(-summary) %>%
unnest_tokens(input = reviews, output = word) %>%
anti_join(filter(stop_words, lexicon == "SMART")) %>%
anti_join(game_stop_words) %>%
anti_join(pt_stopwords) %>%
anti_join(de_stopwords) %>%
count(word, sort = TRUE)
Joining with `by = join_by(word)`Joining with `by = join_by(word)`Joining with `by = join_by(word)`Joining with `by = join_by(word)`

bigrams might be more appropriate for bad reviews
bigrams_3_to_lowest_stop <- reviews_lower_separated %>%
filter(rating_range == "2 to 3" | rating_range == "1 to 2" | rating_range == ">1") %>%
unnest_tokens(bigram, reviews, token = "ngrams", n = 2) %>%
count(bigram, sort = TRUE) %>% # count bigrams
separate(bigram, into = c("word1", "word2"), sep = " ") %>% # split bigrams into two seperate columns
anti_join(stop_words, join_by("word1" == "word")) %>% # check if word1 of bigram is a stop word
anti_join(stop_words, join_by("word2" == "word")) %>%
anti_join(pt_stopwords, join_by("word1" == "word")) %>%
anti_join(pt_stopwords, join_by("word2" == "word")) %>%
anti_join(game_stop_words, join_by("word1" == "word")) %>%
anti_join(game_stop_words, join_by("word2" == "word")) %>%
anti_join(de_stopwords, join_by("word1" == "word")) %>%
anti_join(de_stopwords, join_by("word2" == "word")) %>%
drop_na() %>%
unite(col = "bigram", word1:word2, sep = " ", remove = TRUE) %>%
anti_join(game_stop_bigrams)
Joining with `by = join_by(bigram)`

well fine lets remove game names as well
ok that still kinda sucks
ggwordcloud(words = bigrams_3_to_5_no_stop$bigram, freq = bigrams_3_to_5_no_stop$n, random.color = TRUE, colors = c("#ff595e", "#ffca3a", "#8ac926", "#1982c4", "#6a4c93"))

ok what if we try some of that tf-idf stuff
``
---
title: "Day The Four"
output: html_notebook
---

```{r}
library(tidyverse)
```

```{r}
# Read the Steam checkpoint file; readr guesses the column types.
steam <- read_csv(file = "raw_data/steam_checkpoint_2.csv")
```

```{r}
# Print the tibble for a first look at the data.
steam
```

```{r}
# Show only the game name and its `owners` value.
select(steam, name, owners)
```

For the purposes of making a model, what would we want the target to be? If we were to create a column called "successful", which variables would it be based on?

```{r}
# Count how often each SteamSpy tag occurs across all games.
# `steamspy_tags` is a ";"-separated string. Splitting it into one row per tag
# (instead of three fixed columns) avoids the "Expected 3 pieces" warning that
# `separate()` raises for games with fewer than three tags, and
# `count(sort = TRUE)` replaces the redundant group_by() + count() + arrange()
# while returning an ungrouped result.
steam %>%
  separate_longer_delim(cols = steamspy_tags, delim = ";") %>%
  drop_na(steamspy_tags) %>%
  count(tags = steamspy_tags, sort = TRUE)
```

```{r}
# Creating new logical columns for use in the model - we can't have every
# genre/tag, so a selection of a few will have to do.
# Preferably ones without a lot of overlap.
# Not including Indie just now, because anything developed by Dinkey game would
# be Indie by default - it's a tag that doesn't really describe anything about
# the game.
#
# `str_detect()` already returns TRUE/FALSE/NA, so no `ifelse()` wrapper is
# needed. The new columns are placed right after `multiplayer`.
steam <- steam %>%
  mutate(
    is_open_world = str_detect(steamspy_tags, "Open World"),
    is_rogue_like = str_detect(steamspy_tags, "Rogue-like"),
    is_metroidvania = str_detect(steamspy_tags, "Metroidvania"),
    is_visual_novel = str_detect(steamspy_tags, "Visual Novel"),
    has_great_soundtrack = str_detect(steamspy_tags, "Great Soundtrack"),
    # "Third-Person Shooter" contains "Shooter", so it needs no separate test.
    is_shooter = str_detect(steamspy_tags, "Shooter") |
      str_detect(steamspy_tags, "FPS"),
    is_rpg = str_detect(steamspy_tags, "RPG"),
    has_female_protagonist = str_detect(steamspy_tags, "Female Protagonist"),
    is_sports = str_detect(steamspy_tags, "Sports"),
    is_simulation = str_detect(steamspy_tags, "Simulation"),
    .after = multiplayer
  )
```
```{r}
# Flag games whose `general_rating` label contains "Positive".
# `str_detect()` is already logical, so the `ifelse(..., TRUE, FALSE)` wrapper
# was redundant. The column is placed right after `linux_support`.
steam <- steam %>%
  mutate(
    is_positive = str_detect(general_rating, "Positive"),
    .after = linux_support
  )
```
For the purposes of my model over in Python land, I'm turning `multiplayer` into a plain logical column to see whether it changes anything.

```{r}
# List the distinct values currently stored in `multiplayer`.
distinct(steam, multiplayer)
```


```{r}
# Collapse `multiplayer` to a logical: FALSE when the text contains
# "No multiplayer", TRUE otherwise (missing values stay NA).
# The guard makes the chunk safe to re-run: once the column is logical the
# text pattern can no longer match, so converting a second time would corrupt
# the values.
if (!is.logical(steam$multiplayer)) {
  steam <- steam %>%
    mutate(multiplayer = !str_detect(multiplayer, "No multiplayer"))
}
```
```{r}
# OK fine, I'm adding Indie to see what happens.
# `str_detect()` is already logical, so no `ifelse()` wrapper is needed.
steam <- steam %>%
  mutate(is_indie = str_detect(steamspy_tags, "Indie"))
```
```{r}
# Show the games whose `multiplayer` flag is FALSE.
filter(steam, multiplayer == FALSE)
```
```{r}
# What if we bumped up the threshold? What if "Mostly Positive" was no longer
# acceptable?
# Only the two exact labels below count as positive; every other value,
# including a missing rating, becomes FALSE.
harsh_positive_labels <- c("Extremely Positive", "Positive")

steam_but_harsher <- mutate(
  steam,
  is_positive = general_rating %in% harsh_positive_labels
)
```

```{r}
# Positive games under the stricter definition.
filter(steam_but_harsher, is_positive)
```
```{r}
# Positive games under the original, more lenient definition.
filter(steam, is_positive)
```


```{r}
# Save both versions of the modelling table (lenient and strict `is_positive`).
write_csv(x = steam, file = "clean_data/steam_for_model.csv")
write_csv(x = steam_but_harsher, file = "clean_data/harsher_steam_for_model.csv")
```

____________________________________

# For now, im gonna do some text stuff using backloggd reviews

```{r}
# Read the cleaned Backloggd reviews; readr guesses the column types.
backloggd_reviews <- read_csv(file = "clean_data/backloggd_reviews.csv")
```

```{r}
# Split the list-like `reviews` string into one row per review so each review
# can be handled on its own.
# The four delimiters cover every single/double quote combination around the
# ", " that separates two reviews.
reviews_separated <- backloggd_reviews %>%
  separate_longer_delim("reviews", delim = "',\ \'") %>%
  separate_longer_delim("reviews", delim = "\',\ \"") %>%
  separate_longer_delim("reviews", delim = "\",\ \'") %>%
  separate_longer_delim("reviews", delim = "\",\ \"") %>%
  unique() %>%
  # Strip the leftover opening and closing bracket-plus-quote pairs.
  mutate(
    reviews = reviews %>%
      str_remove_all("\\[\"") %>%
      str_remove_all("\\[\'") %>%
      str_remove_all("\"\\]") %>%
      str_remove_all("\'\\]")
  )
```

```{r}
# Number of separated reviews per game title.
# `count()` groups by `title` itself, so the extra `group_by()` only left the
# result grouped.
reviews_separated %>%
  count(title)
```

# Let's split this into rating ranges, like I did with the other Backloggd data
```{r}
# Bucket each rating into a labelled range.
# The top bucket is closed at 5 so that a perfect 5.0 lands in "4+" instead of
# falling through to "No rating" (the previous upper bound was `rating < 5`).
# NOTE: the label ">1" denotes ratings below 1; it is kept unchanged because
# later chunks filter on that exact string.
reviews_separated <- reviews_separated %>%
  mutate(rating_range = case_when(
    rating >= 4 & rating <= 5 ~ "4+",
    rating >= 3 & rating < 4 ~ "3 to 4",
    rating >= 2 & rating < 3 ~ "2 to 3",
    rating >= 1 & rating < 2 ~ "1 to 2",
    rating < 1 ~ ">1",
    # Missing ratings and anything above 5.
    TRUE ~ "No rating"
  ))
```

Before doing text analysis, let's convert everything to lower case. This might take away some context carried by capitalisation, but we only have
6 reviews per game, so we have to work with what we have.

```{r}
# Lower-case every review before tokenising.
reviews_lower_separated <- mutate(reviews_separated, reviews = tolower(reviews))
# Disabled experiment, kept for reference:
#   filter(reviews %in% c("í", "ñ", "é", "á", "ó", "á") == FALSE)
```

```{r}
library(textdata)
library(tidytext)
# `stopwords()` is needed for the very first stop-word table, so the package
# is attached here rather than at the end of the section.
library(stopwords)
# NOTE(review): `ggwordcloud()` is called below but no chunk attaches a package
# that provides it - presumably the ggwordcloud package; confirm and load it
# here.
```

```{r}
# Stop-word tables. They are defined before any pipeline that anti-joins
# against them, so the notebook runs top to bottom.
es_stopwords <- tibble(word = stopwords("es"))
pt_stopwords <- tibble(word = stopwords("pt"))
de_stopwords <- tibble(word = stopwords("de"))
```

```{r}
# Hand-picked words to drop from the review word counts.
game_stop_words <- tibble(
  word = c("game", "gameplay", "time", "played", "play", "de", "playing", "jogo", "é", "games") # lets try doing n-grams before populating this
)
```

```{r}
# Bigrams taken from the game titles, used to remove game names from the
# review bigram counts. Titles that yield no bigram produce NA and are dropped.
# Reads `backloggd_reviews` (the table loaded above); the previous name
# `reviews_backloggd` was never defined.
game_stop_bigrams <- backloggd_reviews %>%
  select(title) %>%
  unnest_tokens(bigram, title, token = "ngrams", n = 2) %>%
  drop_na()
```

```{r}
# Word frequencies for the higher-rated reviews ("4+" and "3 to 4"), after
# removing SMART, game-specific, Portuguese and German stop words.
# `by = "word"` is spelled out to make the join key explicit and silence the
# "Joining with" messages.
reviews_3_to_5_stop <- reviews_lower_separated %>%
  filter(rating_range %in% c("4+", "3 to 4")) %>%
  select(-summary) %>%
  unnest_tokens(input = reviews, output = word) %>%
  anti_join(filter(stop_words, lexicon == "SMART"), by = "word") %>%
  anti_join(game_stop_words, by = "word") %>%
  anti_join(pt_stopwords, by = "word") %>%
  anti_join(de_stopwords, by = "word") %>%
  count(word, sort = TRUE)
```

```{r}
ggwordcloud(words = reviews_3_to_5_stop$word, freq = reviews_3_to_5_stop$n, random.color = TRUE,
            colors =  c("#ff595e", "#ffca3a", "#8ac926", "#1982c4", "#6a4c93"), min.freq = 10)
```

```{r}
reviews_3_to_5_stop
```

```{r}
# Same word count for the lower-rated reviews ("2 to 3", "1 to 2" and the
# below-1 bucket labelled ">1").
reviews_3_to_lowest_stop <- reviews_lower_separated %>%
  filter(rating_range %in% c("2 to 3", "1 to 2", ">1")) %>%
  select(-summary) %>%
  unnest_tokens(input = reviews, output = word) %>%
  anti_join(filter(stop_words, lexicon == "SMART"), by = "word") %>%
  anti_join(game_stop_words, by = "word") %>%
  anti_join(pt_stopwords, by = "word") %>%
  anti_join(de_stopwords, by = "word") %>%
  count(word, sort = TRUE)
```

```{r}
ggwordcloud(words = reviews_3_to_lowest_stop$word, freq = reviews_3_to_lowest_stop$n, random.color = TRUE,
            colors =  c("#ff595e", "#ffca3a", "#8ac926", "#1982c4", "#6a4c93"), min.freq = 5)
```

bigrams might be more appropriate for bad reviews

```{r}
# Bigram frequencies for the lower-rated reviews. A bigram is dropped when
# either of its words is a stop word, or when the whole bigram also occurs in
# a game title.
bigrams_3_to_lowest_stop <- reviews_lower_separated %>%
  filter(rating_range %in% c("2 to 3", "1 to 2", ">1")) %>%
  unnest_tokens(bigram, reviews, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE) %>% # count bigrams
  separate(bigram, into = c("word1", "word2"), sep = " ") %>% # split bigrams into two separate columns
  anti_join(stop_words, by = join_by(word1 == word)) %>% # check if word1 of bigram is a stop word
  anti_join(stop_words, by = join_by(word2 == word)) %>%
  anti_join(pt_stopwords, by = join_by(word1 == word)) %>%
  anti_join(pt_stopwords, by = join_by(word2 == word)) %>%
  anti_join(game_stop_words, by = join_by(word1 == word)) %>%
  anti_join(game_stop_words, by = join_by(word2 == word)) %>%
  anti_join(de_stopwords, by = join_by(word1 == word)) %>%
  anti_join(de_stopwords, by = join_by(word2 == word)) %>%
  drop_na() %>%
  unite(col = "bigram", word1:word2, sep = " ", remove = TRUE) %>%
  anti_join(game_stop_bigrams, by = "bigram")
```

```{r}
ggwordcloud(words = bigrams_3_to_lowest_stop$bigram, freq = bigrams_3_to_lowest_stop$n, random.color = TRUE,
            colors =  c("#ff595e", "#ffca3a", "#8ac926", "#1982c4", "#6a4c93"), min.freq = 1)
```
well fine lets remove game names as well (the title bigrams are built in the `game_stop_bigrams` chunk above)

ok that still kinda sucks

```{r}
reviews_3_to_lowest_stop
```


```{r}
# Bigram counts for the higher-rated reviews ("4+" and "3 to 4").
high_rated_bigram_counts <- reviews_lower_separated %>%
  filter(rating_range %in% c("4+", "3 to 4")) %>%
  unnest_tokens(bigram, reviews, token = "ngrams", n = 2) %>% # create bigrams
  count(bigram, sort = TRUE) # count bigrams

# Raw counts, stop words included.
bigrams_3_to_5_w_stop <- high_rated_bigram_counts

# Counts after dropping every bigram in which either word is an English,
# Portuguese or game-specific stop word.
bigrams_3_to_5_no_stop <- high_rated_bigram_counts %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>% # one column per word
  anti_join(stop_words, join_by(word1 == word)) %>%
  anti_join(stop_words, join_by(word2 == word)) %>%
  anti_join(pt_stopwords, join_by(word1 == word)) %>%
  anti_join(pt_stopwords, join_by(word2 == word)) %>%
  anti_join(game_stop_words, join_by(word1 == word)) %>%
  anti_join(game_stop_words, join_by(word2 == word)) %>%
  drop_na() %>%
  unite(col = "bigram", word1:word2, sep = " ", remove = TRUE)
```


```{r}
# Word cloud of the stop-word-filtered bigrams from the higher-rated reviews.
ggwordcloud(
  words = bigrams_3_to_5_no_stop$bigram,
  freq = bigrams_3_to_5_no_stop$n,
  random.color = TRUE,
  colors = c("#ff595e", "#ffca3a", "#8ac926", "#1982c4", "#6a4c93")
)
```

ok what if we try some of that tf-idf stuff

```{r}
# Starting point for tf-idf: one row per word of every review.
unnest_tokens(reviews_separated, output = word, input = reviews)
```
``
